import json
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from openai import OpenAI

# Configure LM Studio connection
# The client talks to an OpenAI-compatible endpoint on localhost port 1234.
# The api_key value is a placeholder; presumably the local server does not
# validate it -- confirm against the server configuration.
client = OpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="not-needed"
)

# Define models to use (excluding the embedding model)
# main() passes this list on, and every identifier in it is queried in turn
# for the input file.
MODELS = [
    "qwen/qwen3-vl-4b",
    "ibm/granite-3.2-8b",
    "google/gemma-3n-e4b",
    "meta-llama-3-8b-instruct"
]

def _parse_json_object(text: str) -> Dict:
    """Return the first JSON object (dict) that can be decoded from *text*.

    Scans for each ``{`` and tries to decode a JSON document starting there,
    so code fences or extra prose around the object are tolerated.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded from *text*.
    """
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = text.find('{', start + 1)
    raise json.JSONDecodeError("No JSON object found in model response", text, 0)


def analyze_violence(comment_text: str, model_name: str) -> Dict:
    """
    Analyze a single comment for violence content using the specified model.

    The comment is embedded in a fixed prompt and sent through the
    module-level ``client``; the reply is then parsed as a JSON object.

    Args:
        comment_text: The comment to analyze.
        model_name: Identifier of the model to query.

    Returns:
        On success, the JSON object decoded from the model's reply (the
        prompt asks for ``discusses_violence``, ``frequency_score`` and
        ``brief_explanation``) with ``model_used`` added.
        On failure, a dict whose ``discusses_violence`` and
        ``frequency_score`` are ``None`` and whose ``brief_explanation``
        describes the error; if the reply could not be parsed, the reply
        text is included under ``raw_response``.
    """
    prompt = f"""Analyze the following social media comment for violence-related content.

COMMENT:
{comment_text}

Respond ONLY with valid JSON in this exact format (no additional text):
{{
    "discusses_violence": true/false,
    "frequency_score": 0-10,
    "brief_explanation": "one sentence explanation"
}}

Where:
- discusses_violence: Does the comment discuss violence or a violent episode? (true/false)
- frequency_score: How frequent is the reference to violence from 0 (not at all) to 10 (predominant)
- brief_explanation: Brief reason for your assessment

Respond with ONLY the JSON object, nothing else."""

    # Bound before the try block so the JSON error handler can always report it.
    result_text = ""

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a research assistant analyzing social media comments. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=200
        )

        # The message content can be None; treat that as an empty reply so it
        # is reported as a parsing problem rather than an API error.
        result_text = (response.choices[0].message.content or "").strip()

        # Tolerates code fences and surrounding text; rejects non-object JSON.
        result = _parse_json_object(result_text)
        result['model_used'] = model_name

        return result

    except json.JSONDecodeError as e:
        print(f"  Warning: JSON parsing error for {model_name}")
        return {
            "discusses_violence": None,
            "frequency_score": None,
            "brief_explanation": f"Error parsing response: {str(e)}",
            "raw_response": result_text,
            "model_used": model_name
        }
    except Exception as e:
        # Best-effort boundary: any client/transport failure is recorded for
        # this comment instead of aborting the whole run.
        print(f"  Error with {model_name}: {str(e)}")
        return {
            "discusses_violence": None,
            "frequency_score": None,
            "brief_explanation": f"API Error: {str(e)}",
            "model_used": model_name
        }

def process_csv_with_model(input_file: str, model_name: str, output_dir: str = "multi_model_results") -> Optional[pd.DataFrame]:
    """
    Process a CSV file with a specific model.

    Reads ``input_file``, runs ``analyze_violence`` on every non-empty
    ``text_original`` value, stores the results in model-specific columns,
    writes the frame to ``<output_dir>/<input stem>_<model>.csv`` and prints
    a short summary.

    Args:
        input_file: Path to a CSV with ``comment_id`` and ``text_original``
            columns.
        model_name: Identifier of the model to query.
        output_dir: Directory for the result file; created if missing,
            including any missing parent directories.

    Returns:
        The input DataFrame with the added result columns, or ``None`` if
        the file could not be read or lacks the required columns.
    """
    # parents=True so that a nested output directory can be created in one go.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Model name made safe for use in column names and the output file name.
    model_safe_name = model_name.replace('/', '_').replace('-', '_')
    violence_col = f'discusses_violence_{model_safe_name}'
    score_col = f'violence_score_{model_safe_name}'
    explanation_col = f'explanation_{model_safe_name}'
    raw_col = f'raw_response_{model_safe_name}'

    print(f"\n{'='*60}")
    print(f"Model: {model_name}")
    print(f"File: {input_file}")
    print(f"{'='*60}")

    # Read CSV
    try:
        df = pd.read_csv(input_file)
        print(f"✓ Loaded {len(df)} comments")
    except Exception as e:
        print(f"✗ Error reading file: {e}")
        return None

    # Verify required columns
    if 'comment_id' not in df.columns or 'text_original' not in df.columns:
        print("✗ Error: CSV must have 'comment_id' and 'text_original' columns")
        return None

    # Initialize result columns with model-specific names
    df[violence_col] = None
    df[score_col] = None
    df[explanation_col] = None

    total = len(df)

    # Process each comment
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        comment_id = row['comment_id']
        text = row['text_original']

        # Skip empty comments
        if pd.isna(text) or str(text).strip() == '':
            print(f"  [{position}/{total}] Skipping empty comment {comment_id}")
            continue

        print(f"  [{position}/{total}] Comment {comment_id}...", end=' ')

        # Analyze comment
        result = analyze_violence(text, model_name)

        # Store results
        df.at[idx, violence_col] = result.get('discusses_violence')
        df.at[idx, score_col] = result.get('frequency_score')
        df.at[idx, explanation_col] = result.get('brief_explanation')

        # The raw-response column is only created once a parsing error occurs.
        if 'raw_response' in result:
            if raw_col not in df.columns:
                df[raw_col] = None
            df.at[idx, raw_col] = result.get('raw_response')

        print(f"Violence: {result.get('discusses_violence')}, Score: {result.get('frequency_score')}")

        # Small delay
        time.sleep(0.1)

    # Save results for this model
    input_stem = Path(input_file).stem
    output_file = Path(output_dir) / f"{input_stem}_{model_safe_name}.csv"
    df.to_csv(output_file, index=False)
    print(f"\n✓ Results saved to: {output_file}")

    # Summary statistics are computed on coerced copies: the stored columns
    # hold whatever the model returned, and a non-numeric score must not
    # crash the summary after the whole file has been processed.
    scores = pd.to_numeric(df[score_col], errors='coerce')
    violence_count = int(df[violence_col].eq(True).sum())

    print(f"\nSUMMARY for {model_name}:")
    print(f"  Comments with violence: {violence_count}")
    print(f"  Average violence score: {scores.mean():.2f}")
    print(f"  Max violence score: {scores.max()}")

    return df

def process_csv_with_all_models(input_file: str, models: Optional[List[str]] = None, output_dir: str = "multi_model_results") -> Optional[pd.DataFrame]:
    """
    Process a single CSV file with multiple models and create a comparison file.

    Each model is run through ``process_csv_with_model``. The columns those
    runs produce are merged next to ``comment_id`` and ``text_original`` and
    written to ``<output_dir>/<input stem>_ALL_MODELS_COMPARISON.csv``,
    followed by a printed per-model summary.

    Args:
        input_file: Path to the input CSV.
        models: Model identifiers to run; defaults to ``MODELS`` when ``None``.
        output_dir: Directory that receives the per-model and combined files.

    Returns:
        The combined DataFrame, or ``None`` if no model produced a result.
    """
    if models is None:
        models = MODELS

    print(f"\n{'#'*60}")
    print(f"MULTI-MODEL PROCESSING")
    print(f"File: {input_file}")
    print(f"Models: {len(models)}")
    print(f"{'#'*60}")

    all_results = []

    # Process with each model
    for model in models:
        result_df = process_csv_with_model(input_file, model, output_dir)
        if result_df is not None:
            all_results.append(result_df)

    # Nothing to combine if every model failed (or no models were given).
    if not all_results:
        return None

    # Start with the base dataframe (comment_id and text_original)
    base_df = pd.read_csv(input_file)[['comment_id', 'text_original']]

    # Add every other column from each model's result frame
    for result_df in all_results:
        for col in result_df.columns:
            if col not in ['comment_id', 'text_original']:
                base_df[col] = result_df[col]

    # Save combined file
    input_stem = Path(input_file).stem
    combined_file = Path(output_dir) / f"{input_stem}_ALL_MODELS_COMPARISON.csv"
    base_df.to_csv(combined_file, index=False)

    print(f"\n{'='*60}")
    print(f"✓ Combined comparison file saved to: {combined_file}")
    print(f"{'='*60}")

    # Overall comparison summary
    print(f"\nMODEL COMPARISON SUMMARY:")
    print(f"{'Model':<35} {'Violence %':<15} {'Avg Score':<10}")
    print(f"{'-'*60}")

    total_comments = len(base_df)

    for model in models:
        model_safe_name = model.replace('/', '_').replace('-', '_')
        violence_col = f'discusses_violence_{model_safe_name}'
        score_col = f'violence_score_{model_safe_name}'

        if violence_col in base_df.columns:
            violence_count = base_df[violence_col].eq(True).sum()
            # Guard against a header-only CSV, where the row count is zero.
            violence_pct = (violence_count / total_comments) * 100 if total_comments else 0.0
            # Coerce so a non-numeric score from a model cannot crash the summary.
            avg_score = pd.to_numeric(base_df[score_col], errors='coerce').mean()
            print(f"{model:<35} {violence_pct:>6.1f}%        {avg_score:>6.2f}")

    return base_df

def main():
    """
    Command-line entry point.

    Expects the input CSV path as the first argument and an optional output
    directory as the second. Prints usage and exits with status 1 when no
    input file is given or the file does not exist; otherwise runs every
    model in MODELS over the file.
    """
    cli_args = sys.argv[1:]

    # No input file given: show usage plus the configured models, then bail out.
    if not cli_args:
        usage_lines = (
            "Usage: python multi_model_labeling.py <input_csv_file> [output_dir]",
            "\nExample:",
            "  python multi_model_labeling.py comments_batch_01.csv",
            "  python multi_model_labeling.py comments_batch_01.csv my_results",
            "\nWill process with these models:",
        )
        for usage_line in usage_lines:
            print(usage_line)
        for model_name in MODELS:
            print(f"  - {model_name}")
        sys.exit(1)

    input_file = cli_args[0]
    if len(cli_args) >= 2:
        output_dir = cli_args[1]
    else:
        output_dir = "multi_model_results"

    if not Path(input_file).exists():
        print(f"Error: File not found: {input_file}")
        sys.exit(1)

    # Process with all models
    process_csv_with_all_models(input_file, MODELS, output_dir)

    banner = '#' * 60
    print(f"\n{banner}")
    print("PROCESSING COMPLETE!")
    print(f"All results saved to: {output_dir}/")
    print(f"{banner}")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
